Goal: identify the top (richest) nodes and the nodes they transact with, and visualize them as a weighted transaction graph.


In [1]:
import collections
import logging
import sys
import time

import matplotlib
import matplotlib.pyplot
import networkx as nx
from IPython.display import display, clear_output

import config
import graph_analyzer
import utils

logging.basicConfig(level=logging.INFO)

# Sentinel pubkey id used when a CSV row has no parsable output column.
EMPTY_OUTPUT = -1

def log(msg):
    """Print a progress message and flush stdout immediately.

    Uses the function-call form of print (valid in both Python 2 and 3
    for a single argument) for consistency with the print(...) calls
    used elsewhere in this notebook.
    """
    print(msg)
    sys.stdout.flush()

In [72]:
COIN_CODE = "PPC"  # coin ticker used in export file names (presumably Peercoin)
N_CLUSTERS = 10    # number of richest clusters to seed the graph with
# Exported blockchain data under config.data_dir:
#   CSV_FILE           - per-transaction (tx_id, input pubkey/value, output pubkey/value) rows
#   INPUT_COUNTS_FILE  - (tx_id, number of inputs) rows
#   OUTPUT_COUNTS_FILE - (tx_id, number of outputs) rows
CSV_FILE = config.data_dir + "csv_exports/edges-%s.csv" % COIN_CODE
INPUT_COUNTS_FILE = config.data_dir + "txin/txin-%s.csv" % COIN_CODE
OUTPUT_COUNTS_FILE = config.data_dir + "txout/txout-%s.csv" % COIN_CODE

In [73]:
log("Generating graph analyzer")
# Project helper used below for cluster_for_pubkey, balance_for_cluster,
# and richest_n_clusters lookups for the selected coin.
analyzer = graph_analyzer.GraphAnalyzer(COIN_CODE)


Generating graph analyzer

In [74]:
log("Calculating input and output counts")

def _load_counts(path):
    """Read (tx_id, count) CSV rows into a defaultdict(int) keyed by tx id."""
    counts = collections.defaultdict(int)
    for record in utils.get_csvreader(path):
        if len(record) > 1:
            counts[int(record[0])] = int(record[1])
    return counts

input_counts = _load_counts(INPUT_COUNTS_FILE)
output_counts = _load_counts(OUTPUT_COUNTS_FILE)


Calculating input and output counts

In [75]:
log("Determining top clusters")
# (cluster_id, value) pairs for the N richest clusters
top_clusters_data = analyzer.richest_n_clusters(N_CLUSTERS)
# just the cluster ids, in rank order
top_clusters = [cluster_id for cluster_id, _value in top_clusters_data]


Determining top clusters

In [76]:
log("Creating clusters data structure")
# cluster_id -> cluster value at the latest time; defaultdict(int) so a
# missing cluster reads as 0
clusters = collections.defaultdict(int)
clusters.update(top_clusters_data)


Creating clusters data structure

In [77]:
log("Adding neighboring clusters and calculating edge weights")

# dictionary with (send_cluster_id, recv_cluster_id) as keys,
# and total value transferred as values
edge_weights = collections.defaultdict(int)

def update_edge_weights(tx_inputs, tx_outputs, edge_weights):
    """Fold one completed transaction's per-pubkey totals into edge_weights.

    Reads module-level state: `last_tx` (id of the transaction whose rows
    were just accumulated), `input_counts`, `output_counts`, `analyzer`,
    `clusters`, and `top_clusters`. Mutates `tx_inputs`, `tx_outputs`,
    `clusters`, and `edge_weights` in place.
    """
    # correct for multiple counts
    input_set = set(tx_inputs.keys())
    output_set = set(tx_outputs.keys())

    # Each pubkey's value was summed once per (input, output) row pair in
    # the edges CSV, so divide by the transaction's output/input count to
    # undo the over-counting.
    # NOTE(review): with Python 2 ints `/=` truncates, and a tx id missing
    # from the counts defaultdict yields 0 -> ZeroDivisionError; confirm the
    # count files cover every tx in the edges CSV.
    for pubkey in tx_inputs:
        tx_inputs[pubkey] /= output_counts[last_tx]
    for pubkey in tx_outputs:
        tx_outputs[pubkey] /= input_counts[last_tx]

    # attempt to determine a sender cluster_id
    # (Python 2: dict.keys() is a list; an arbitrary input pubkey is used,
    # assuming all inputs of a tx belong to the same cluster)
    sender_cluster_id = analyzer.cluster_for_pubkey(tx_inputs.keys()[0])

    if not sender_cluster_id:
        return

    # Only consider pure outputs (pubkeys that were not also inputs,
    # i.e. skip change sent back to the sender).
    only_output = output_set.difference(input_set)
    for pubkey in only_output:
        recv_cluster_id = analyzer.cluster_for_pubkey(pubkey)

        # Keep the edge only if the receiver resolved, it is not a
        # self-transfer, and at least one endpoint is a top cluster.
        if recv_cluster_id \
            and (sender_cluster_id != recv_cluster_id) \
            and ((sender_cluster_id in top_clusters) or (recv_cluster_id in top_clusters)):
            
            # Lazily record balances for neighbor clusters outside the top N
            # so they also get drawn with a meaningful size.
            if sender_cluster_id not in clusters:
                clusters[sender_cluster_id] = analyzer.balance_for_cluster(sender_cluster_id)
                
            if recv_cluster_id not in clusters:
                clusters[recv_cluster_id] = analyzer.balance_for_cluster(recv_cluster_id)

            edge_weights[(sender_cluster_id, recv_cluster_id)] += tx_outputs[pubkey]

# Stream the edges CSV (rows grouped by tx id) and accumulate per-pubkey
# input/output totals; when the tx id changes, flush the finished
# transaction into edge_weights. These three names are intentionally
# module-level: update_edge_weights reads `last_tx` as a global.
last_tx = None
tx_inputs = collections.defaultdict(int)   # pubkey -> summed input value
tx_outputs = collections.defaultdict(int)  # pubkey -> summed output value
counter = 0
for row in utils.get_csvreader(CSV_FILE):
    if len(row) < 2:
        continue

    # lightweight progress indicator
    if counter % 10000 == 0:
        clear_output(wait=True)
        print(counter)
        sys.stdout.flush()

    tx_id = int(row[0])

    # detect coingen
    # (coingen rows have no parsable input pubkey/value, so int() raises)
    try:
        pubkey_input = int(row[1])
        value_input = int(row[2])
    except:
        pubkey_input = config.COINGEN_ADDRESS
        value_input = 0

    # a missing/unparsable output column marks an empty output
    try:
        pubkey_output = int(row[3])
    except:
        pubkey_output = EMPTY_OUTPUT
    value_output = int(row[4])

    if tx_id != last_tx and tx_inputs:
        # end of reading a transaction series. Update edges
        # (last_tx still holds the id of the just-finished transaction,
        # which update_edge_weights reads as a global)
        update_edge_weights(tx_inputs, tx_outputs, edge_weights)

        tx_inputs = collections.defaultdict(int)
        tx_outputs = collections.defaultdict(int)

    last_tx = tx_id
    tx_inputs[pubkey_input] += value_input
    if pubkey_output != EMPTY_OUTPUT:
        tx_outputs[pubkey_output] += value_output

    counter += 1

# flush the final transaction, which the loop never sees a tx-id change for
update_edge_weights(tx_inputs, tx_outputs, edge_weights)


2100000

In [78]:
G = nx.DiGraph()

log("Adding nodes")
G.add_nodes_from(clusters.keys())

log("Adding edges")
counter = 0
for key in edge_weights:

    # lightweight progress indicator
    if counter % 1000 == 0:
        clear_output(wait=True)
        print(counter)
        sys.stdout.flush()

    try:
        G.add_weighted_edges_from([(key[0], key[1], edge_weights[key])])
    except:
        pass

    counter += 1

# only draw connected nodes (those that appear in at least one edge)
connected_nodes = {}
for sender, recvr in edge_weights:
    connected_nodes[sender] = True
    connected_nodes[recvr] = True
nodelist = list(connected_nodes.keys())

log("Calculating node sizes")
# BUG FIX: node sizes must line up one-to-one with the nodelist handed to
# draw_networkx. Previously node_sizes followed clusters' iteration order
# (and full length), while nodelist was the smaller connected_nodes set in
# a different order, so sizes were attached to the wrong nodes. Build the
# sizes per node, in nodelist order. Sizes scale linearly with cluster
# value, normalized so the richest cluster gets 10000.
max_value = max(clusters.values())
node_sizes = [int((clusters[node] / float(max_value)) * 10000) for node in nodelist]

fig = matplotlib.pyplot.gcf()
fig.set_size_inches(20, 20)

log("Calculating layout")
pos = nx.pygraphviz_layout(G)

log("Drawing")
# Draw nodes only; edges are drawn below with per-edge widths.
nx.draw_networkx(G, pos=pos, with_labels=False, nodelist=nodelist, node_size=node_sizes, alpha=0.8, edgelist=[])

log("Drawing edges")
# (removed the unused edge_weights_scaled dict that was never populated)
max_edge_weight = max(edge_weights.values())
counter = 0
for key in edge_weights:

    if counter % 100 == 0:
        clear_output(wait=True)
        print(counter)
        sys.stdout.flush()

    # edge width scales linearly with transferred value, up to 100
    nx.draw_networkx_edges(G, pos, edgelist=[key], width=int((edge_weights[key]/float(max_edge_weight)) * 100), alpha=0.4, arrows=False)

    counter += 1

fig.savefig('graph.png', dpi=400)


18300

In [ ]: